library(tidyr)
library(dplyr)
library(jsonlite)
library(purrr)
library(stringr)
library(lubridate)
library(modelr)
library(broom)
library(tibble)
library(ggplot2)
library(mgcv)
Apro il file JSON, per come è fatto devo accedere al primo elemento e posso quindi caricarlo in un tibble. Sono 14.536 liste.
complete <- read_json("complete.json")
programs <- complete[[1]]
programsT <- tibble(programs)
remove(complete)
remove(programs)
A questo punto ho la possibilità di accedere ai programmi, controllo che siano tutte liste di lunghezza 6.
programsT %>%
mutate(programs_L = sapply(programs,length)) %>%
filter(programs_L != 6)
## # A tibble: 0 × 2
## # … with 2 variables: programs <list>, programs_L <int>
Posso quindi espandere i programmi.
programsT <- programsT %>% unnest_wider(programs)
programsT %>% distinct(programID)
## # A tibble: 14,531 × 1
## programID
## <chr>
## 1 3853
## 2 5178
## 3 10785
## 4 5887
## 5 305
## 6 3368
## 7 4226
## 8 5087
## 9 6310
## 10 1979
## # … with 14,521 more rows
programsT %>% select(programID) %>% n_distinct() # ci sono 14531 programmi
## [1] 14531
Nella maggior parte dei casi a un programma è associato un concerto, ma non è sempre così. In totale ci sono 22525 concerti.
programsT %>%
mutate(concerts_L=sapply(concerts,length)) %>%
count(concerts_L) %>%
mutate(concertN = concerts_L * n) %>%
mutate(concertsT = cumsum(concertN)) %>%
arrange(-concertsT)
## # A tibble: 11 × 4
## concerts_L n concertN concertsT
## <int> <int> <int> <int>
## 1 16 4 64 22525
## 2 12 4 48 22461
## 3 9 7 63 22413
## 4 8 2 16 22350
## 5 7 2 14 22334
## 6 6 5 30 22320
## 7 5 84 420 22290
## 8 4 1062 4248 21870
## 9 3 1033 3099 17622
## 10 2 2190 4380 14523
## 11 1 10143 10143 10143
Espando i concerti, ottengo una riga per ogni concerto. Passo da 14.536 a 22.525 righe. Aggiungo anche una colonna concertID. Passo a 7 colonne.
programsT <- programsT %>% unnest_longer(concerts)
nrow(programsT)
## [1] 22525
programsT <- rowid_to_column(programsT,var = "concertID")
programsT <- select(programsT,id,programID,orchestra,season,concertID,concerts,works)
Verifico che ogni concerto abbia 5 campi.
programsT %>%
mutate(concerts_L = sapply(concerts, length)) %>%
filter(concerts_L != 5)
## # A tibble: 0 × 8
## # … with 8 variables: id <chr>, programID <chr>, orchestra <chr>, season <chr>,
## # concertID <int>, concerts <list>, works <list>, concerts_L <int>
Posso espandere con unnest_wider. Allargo il tibble di 4 colonne, al posto di concerts ho eventType, Location, Venue, Date e Time.
programsT <- programsT %>% unnest_wider(concerts)
Come si può vedere, un programma può contenere più concerti.
programsT %>% group_by(programID) %>% count(sort = TRUE)
## # A tibble: 14,531 × 2
## # Groups: programID [14,531]
## programID n
## <chr> <int>
## 1 10700 16
## 2 10702 16
## 3 3128 16
## 4 3139 16
## 5 10701 12
## 6 10703 12
## 7 3134 12
## 8 3144 12
## 9 14385 9
## 10 14403 9
## # … with 14,521 more rows
Per esempio il programma 10700 contiene 16 concerti. In questo caso vediamo anche che di alcuni concerti non è indicata l’ora. Per questo motivo ho introdotto un concertID, se in seguito mi fossi limitato a identificare il concerto con eventType, Location, Venue, Date, Time, avrei perso alcuni concerti.
programsT %>%
filter(programID == 10700)
## # A tibble: 16 × 11
## id progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date Time works
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <list>
## 1 dbe4… 10700 New Yo… 1950-… 8525 Special Manhat… Roxy… 1951… 12:4… <list>
## 2 dbe4… 10700 New Yo… 1950-… 8526 Special Manhat… Roxy… 1951… None <list>
## 3 dbe4… 10700 New Yo… 1950-… 8527 Special Manhat… Roxy… 1951… None <list>
## 4 dbe4… 10700 New Yo… 1950-… 8528 Special Manhat… Roxy… 1951… None <list>
## 5 dbe4… 10700 New Yo… 1950-… 8529 Special Manhat… Roxy… 1951… 12:4… <list>
## 6 dbe4… 10700 New Yo… 1950-… 8530 Special Manhat… Roxy… 1951… None <list>
## 7 dbe4… 10700 New Yo… 1950-… 8531 Special Manhat… Roxy… 1951… None <list>
## 8 dbe4… 10700 New Yo… 1950-… 8532 Special Manhat… Roxy… 1951… None <list>
## 9 dbe4… 10700 New Yo… 1950-… 8533 Special Manhat… Roxy… 1951… 12:4… <list>
## 10 dbe4… 10700 New Yo… 1950-… 8534 Special Manhat… Roxy… 1951… None <list>
## 11 dbe4… 10700 New Yo… 1950-… 8535 Special Manhat… Roxy… 1951… None <list>
## 12 dbe4… 10700 New Yo… 1950-… 8536 Special Manhat… Roxy… 1951… None <list>
## 13 dbe4… 10700 New Yo… 1950-… 8537 Special Manhat… Roxy… 1951… None <list>
## 14 dbe4… 10700 New Yo… 1950-… 8538 Special Manhat… Roxy… 1951… None <list>
## 15 dbe4… 10700 New Yo… 1950-… 8539 Special Manhat… Roxy… 1951… None <list>
## 16 dbe4… 10700 New Yo… 1950-… 8540 Special Manhat… Roxy… 1951… 12:4… <list>
## # … with abbreviated variable names ¹programID, ²orchestra, ³concertID,
## # ⁴eventType, ⁵Location
Ogni programma contiene un certo numero di lavori (works). In totale ci sono 125.224 works (si tratta di works che possono essere ripetuti in diversi programs).
programsT %>%
mutate(works_L=sapply(works,length)) %>%
count(works_L) %>%
mutate(works_N = works_L * n) %>%
mutate(works_T = cumsum(works_N)) %>%
arrange(-works_T)
## # A tibble: 37 × 4
## works_L n works_N works_T
## <int> <int> <int> <int>
## 1 50 13 650 125224
## 2 48 4 192 124574
## 3 46 5 230 124382
## 4 41 1 41 124152
## 5 36 1 36 124111
## 6 32 4 128 124075
## 7 30 7 210 123947
## 8 29 2 58 123737
## 9 28 6 168 123679
## 10 27 7 189 123511
## # … with 27 more rows
Ci sono dei program che hanno 0 works, che cosa sono? Si tratta di eventi che non prevedono musica o dei quali mancano informazioni.
programsT %>%
mutate(works_L=sapply(works,length)) %>%
filter(works_L == 0)
## # A tibble: 29 × 12
## id progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date Time works
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <list>
## 1 2a85… 13816 New Yo… 1894-… 577 Subscr… Manhat… Metr… 1895… 8:00… <NULL>
## 2 21fa… 12310 Stadiu… 1934-… 5697 Stadiu… Manhat… Grea… 1935… 8:30… <NULL>
## 3 e76c… 10203 Member… 1943-… 7229 Studen… Manhat… Hunt… 1944… 3:30… <NULL>
## 4 1b6c… 8607 NYP Pr… 1998-… 17636 Chamber Manhat… Alic… 1998… 8:00… <NULL>
## 5 e91b… 13779 Musici… 2007-… 19411 Musici… Manhat… Hull… 2007… 6:30… <NULL>
## 6 7112… 13665 Musici… 2012-… 20558 Chambe… Manhat… Doub… 2012… 8:30… <NULL>
## 7 3b66… 13666 NYP Pr… 2012-… 20700 Chambe… Manhat… St. … 2013… 7:00… <NULL>
## 8 26fe… 13667 Musici… 2012-… 20704 Chambe… Manhat… Racq… 2013… None <NULL>
## 9 98f2… 13749 NYP Pr… 2013-… 20745 Chamber Manhat… The … 2013… 3:00… <NULL>
## 10 3dd4… 13341 NYP Pr… 2013-… 20767 Chambe… Manhat… None 2013… None <NULL>
## # … with 19 more rows, 1 more variable: works_L <int>, and abbreviated variable
## # names ¹programID, ²orchestra, ³concertID, ⁴eventType, ⁵Location
Posso utilizzare unnest_longer, ogni lavoro associato a un programma darà origine a una riga. Devo usare l’opzione keep_empty TRUE per non perdere le righe con zero works. Quindi il tibble passerà a 125.224 + 29 righe, ovvero 125.253.
programsT <- programsT %>% unnest_longer(works, keep_empty = TRUE)
programsT %>% filter(programID == 13665)
## # A tibble: 1 × 11
## id progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date Time works
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <list>
## 1 71124… 13665 Musici… 2012-… 20558 Chambe… Manhat… Doub… 2012… 8:30… <NULL>
## # … with abbreviated variable names ¹programID, ²orchestra, ³concertID,
## # ⁴eventType, ⁵Location
Ora works contiene liste di lunghezze diverse, a seconda delle caratteristiche del work compreso.
programsT %>%
mutate(works_L=sapply(works,length)) %>%
count(works_L)
## # A tibble: 5 × 2
## works_L n
## <int> <int>
## 1 0 29
## 2 3 18489
## 3 4 5292
## 4 5 68041
## 5 6 33402
Le liste di lunghezza 3 sono quelle che contengono le intermissions.
programsT %>%
mutate(works_L = sapply(works,length)) %>%
filter(works_L == 3) %>%
select(works) %>%
unnest_wider(works) %>%
count(ID, interval, soloists)
## # A tibble: 4 × 4
## ID interval soloists n
## <chr> <chr> <lgl> <int>
## 1 0* Intermission NA 18394
## 2 1743* Intermission-Third NA 1
## 3 4346* Intermission-Short NA 54
## 4 7955* Intermission-Second NA 40
Cerco di capire cosa contengono le altre liste (di lunghezza 4, 5 e 6).
programsT %>%
mutate(works_L = sapply(works,length)) %>%
filter(works_L == 4) %>%
select(works) %>%
unnest_wider(works)
## # A tibble: 5,292 × 4
## ID composerName workTitle soloists
## <chr> <chr> <list> <list>
## 1 3642* Hummel, Johann <chr [1]> <list [5]>
## 2 52425* Beethoven, Ludwig van <chr [1]> <list [7]>
## 3 3712* Bochsa, Robert N. C. <chr [1]> <list [2]>
## 4 3764* Lindpaintner, Peter Von <chr [1]> <list [5]>
## 5 3712* Bochsa, Robert N. C. <chr [1]> <list [2]>
## 6 3764* Lindpaintner, Peter Von <chr [1]> <list [5]>
## 7 3894* Traditional, <chr [1]> <list [2]>
## 8 3468* Haydn, Franz Joseph <chr [1]> <list [1]>
## 9 3991* Donizetti, Gaetano <chr [1]> <list [2]>
## 10 3999* Hummel, Johann <chr [1]> <list [7]>
## # … with 5,282 more rows
programsT %>%
mutate(works_L = sapply(works,length)) %>%
filter(works_L == 5) %>%
select(works) %>%
unnest_wider(works)
## # A tibble: 68,041 × 6
## ID composerName workTitle conductorName soloi…¹ movem…²
## <chr> <chr> <list> <chr> <list> <list>
## 1 52446* Beethoven, Ludwig van <chr [1]> Hill, Ureli Corel… <NULL> <NULL>
## 2 5543* Kalliwoda, Johann W. <chr [1]> Timm, Henry C. <NULL> <NULL>
## 3 52437* Beethoven, Ludwig van <chr [1]> Hill, Ureli Corel… <NULL> <NULL>
## 4 3659* Romberg, Bernhard <chr [1]> Hill, Ureli Corel… <list> <NULL>
## 5 4567* Hummel, Johann <chr [1]> Hill, Ureli Corel… <list> <NULL>
## 6 5150* Pacini, Giovanni <chr [1]> Not conducted <list> <NULL>
## 7 5161* Romberg, Bernhard <chr [1]> Not conducted <list> <NULL>
## 8 5166* Thalberg, Sigismond <chr [1]> Not conducted <list> <NULL>
## 9 5172* Herz, Henri <chr [1]> Alpers, William <list> <NULL>
## 10 5174* Lindpaintner, Peter Von <chr [1]> Not conducted <list> <NULL>
## # … with 68,031 more rows, and abbreviated variable names ¹soloists, ²movement
programsT %>%
mutate(works_L = sapply(works,length)) %>%
filter(works_L == 6) %>%
select(works) %>%
unnest_wider(works)
## # A tibble: 33,402 × 6
## ID composerName workTitle movem…¹ condu…² soloi…³
## <chr> <chr> <chr> <list> <chr> <list>
## 1 8834*4 Weber, Carl Maria Von OBERON <chr> Timm, … <list>
## 2 8834*3 Weber, Carl Maria Von OBERON <chr> Etienn… <NULL>
## 3 8835*1 Rossini, Gioachino ARMIDA <chr> Timm, … <list>
## 4 8837*6 Beethoven, Ludwig van FIDELIO, OP. 72 <chr> Timm, … <list>
## 5 8336*4 Mozart, Wolfgang Amadeus ABDUCTION FROM TH… <chr> Timm, … <list>
## 6 8838*2 Bellini, Vincenzo I PURITANI <chr> Hill, … <list>
## 7 8839*2 Rossini, Gioachino WILLIAM TELL <chr> Alpers… <NULL>
## 8 53076*2 Rossini, Gioachino STABAT MATER <chr> Alpers… <list>
## 9 51568*2 Hummel, Johann CONCERTO, PIANO, … <chr> Alpers… <list>
## 10 51568*3 Hummel, Johann CONCERTO, PIANO, … <chr> Alpers… <list>
## # … with 33,392 more rows, and abbreviated variable names ¹movement,
## # ²conductorName, ³soloists
Faccio unnest_wider di works. Passo da 11 colonne a 17 (al posto di works si aggiungono tutte le colonne comuni ai diversi works, ovvero ID, composerName, workTitle, movement, conductorName, soloists, interval)
programsT <- programsT %>% unnest_wider(works)
Passo a prendere in considerazione workTitle.
programsT %>%
mutate(workTitle_L = sapply(workTitle,length)) %>%
count(workTitle_L)
## # A tibble: 3 × 2
## workTitle_L n
## <int> <int>
## 1 0 18518
## 2 1 106723
## 3 2 12
Come sono gli elementi di lunghezza 2.
programsT %>%
mutate(workTitle_L = sapply(workTitle, length)) %>%
filter(workTitle_L == 2) %>%
pull(workTitle) %>%
head(3)
## [[1]]
## [[1]]$`_`
## [1] ", PROCESSION OF THE KNIGHTS OF THE HOLY GRAIL"
##
## [[1]]$em
## [1] "PARSIFAL"
##
##
## [[2]]
## [[2]]$`_`
## [1] "CHORUS OF VILLAGERS FROM (ARR. CLARINET ENS.) (ARR. Bellison)"
##
## [[2]]$em
## [1] "PRINCE IGOR"
##
##
## [[3]]
## [[3]]$`_`
## [1] "AIR FROM (ARR. CLARINET ENS.) (ARR. Bellison)"
##
## [[3]]$em
## [1] "SUITE IN D MAJOR"
Mi salvo i program ID per ritrovarli dopo la modifica.
wt2ids <- programsT %>%
mutate(workTitle_L = sapply(workTitle, length)) %>%
filter(workTitle_L == 2) %>%
select(ID)
Applico funzione per sistemarli (sistemo anche NA al posto di NULL), faccio semi_join con gli id per verificare il risultato su quelli di lunghezza 2.
mod_w <- function(x) {
if (is.null(x)) return (NA)
if (length(x) == 1) return (x[[1]])
if (length(x) == 2) return (combine_w(x[[1]], x[[2]]))
}
combine_w <- function(x,y) {
if (startsWith(x,",")) {
return (paste(y,x))
} else {
return (paste(x,y))
}
}
programsT %>%
mutate(workTitle = sapply(workTitle, mod_w)) %>%
semi_join(wt2ids) %>%
select(workTitle)
## Joining with `by = join_by(ID)`
## # A tibble: 12 × 1
## workTitle
## <chr>
## 1 PARSIFAL , PROCESSION OF THE KNIGHTS OF THE HOLY GRAIL
## 2 CHORUS OF VILLAGERS FROM (ARR. CLARINET ENS.) (ARR. Bellison) PRINCE IGOR
## 3 AIR FROM (ARR. CLARINET ENS.) (ARR. Bellison) SUITE IN D MAJOR
## 4 ANDANTE FROM (ARR. CLARINET ENS.) (ARR. Bellison) SURPRISE SYMPHONY
## 5 OVERTURE, DI BALLO
## 6 OVERTURE, DI BALLO
## 7 OVERTURE, DI BALLO
## 8 QUARTET, STRING, OP. 76, NO. 2, D MINOR, H.III:76, FIFTHS
## 9 CONCERT FANTASY ON FOR VIOLIN AND PIANO (ARR. Sarasate) CARMEN
## 10 QUARTET, STRING, OP. 76, NO. 2, D MINOR, H.III:76, FIFTHS
## 11 QUARTET, STRING, OP. 76, NO. 2, D MINOR, H.III:76, FIFTHS
## 12 QUARTET, STRING, OP. 76, NO. 2, D MINOR, H.III:76, FIFTHS
Eseguo l’operazione e ri-assegno a programsT.
programsT <- programsT %>%
mutate(workTitle = sapply(workTitle, mod_w))
Verifico la lunghezza degli elementi contenuti in movement.
programsT %>%
mutate(movement_L = sapply(movement, length)) %>%
count(movement_L)
## # A tibble: 3 × 2
## movement_L n
## <int> <int>
## 1 0 89759
## 2 1 35346
## 3 2 148
Vedo come sono quelli di lunghezza 2.
programsT %>%
mutate(movement_L = sapply(movement, length)) %>%
filter(movement_L == 2)
## # A tibble: 148 × 18
## id progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date Time ID
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 6a02e… 1907 New Yo… 1849-… 33 Subscr… Manhat… Apol… 1850… 8:00… 5801…
## 2 6a02e… 1907 New Yo… 1849-… 33 Subscr… Manhat… Apol… 1850… 8:00… 5801…
## 3 6f3bc… 11474 Musici… 1876-… 172 Special Manhat… Stei… 1876… 8:00… 6529…
## 4 d3eff… 871 New Yo… 1879-… 228 Subscr… Manhat… Acad… 1880… 8:00… 5801…
## 5 d3eff… 871 New Yo… 1879-… 228 Subscr… Manhat… Acad… 1880… 8:00… 5801…
## 6 d4145… 9023 New Yo… 1898-… 695 Young … Manhat… Carn… 1899… 2:30… 6529…
## 7 09a41… 10541 New Yo… 1903-… 908 Young … Manhat… Carn… 1904… None 6529…
## 8 59816… 10512 New Yo… 1905-… 987 Subscr… Manhat… Carn… 1905… 2:30… 1063…
## 9 604c7… 10440 New Yo… 1913-… 1688 Young … Manhat… Carn… 1914… None 1050…
## 10 604c7… 10440 New Yo… 1913-… 1688 Young … Manhat… Carn… 1914… None 1050…
## # … with 138 more rows, 7 more variables: composerName <chr>, workTitle <chr>,
## # conductorName <chr>, soloists <list>, movement <list>, interval <chr>,
## # movement_L <int>, and abbreviated variable names ¹programID, ²orchestra,
## # ³concertID, ⁴eventType, ⁵Location
Mi limito a concatenarli, con la funzione mod_m. Inoltre sostituisco i NULL con NA.
mod_m <- function(x) {
if (is.null(x)) return (NA)
if (length(x) == 1) return (x[[1]])
if (length(x) == 2) return (paste(x[[1]], "_", x[[2]]))
}
programsT <- programsT %>% mutate(movement = sapply(movement, mod_m))
Controllo la lunghezza delle liste contenute in soloists. Ho 89.321 potenziali righe di informazioni su solisti + 85.122 righe nulle.
programsT %>%
mutate(soloists_L = sapply(soloists, length)) %>%
count(soloists_L) %>%
mutate(soloists_N = soloists_L * n) %>%
mutate(soloists_T = cumsum(soloists_N)) %>%
filter(soloists_L == 0 | soloists_L == max(soloists_L))
## # A tibble: 2 × 4
## soloists_L n soloists_N soloists_T
## <int> <int> <int> <int>
## 1 0 85122 0 0
## 2 84 1 84 89321
Provo unnest_longer di soloists con opzione keep_empty per tenere anche le righe che contengono liste NULL e vedo che sono di lunghezza 0 o 3.
programsT %>%
unnest_longer(soloists, keep_empty = TRUE) %>%
mutate(soloists_L = sapply(soloists, length)) %>%
count(soloists_L)
## # A tibble: 2 × 2
## soloists_L n
## <int> <int>
## 1 0 85127
## 2 3 89316
Ce ne sono cinque che sono di lunghezza 1, ma poi NULL.
programsT %>%
mutate(soloists_L = sapply(soloists, length)) %>%
filter(soloists_L != 0) %>%
unnest_longer(soloists, keep_empty = TRUE) %>%
mutate(soloists_L2 = sapply(soloists, length)) %>%
filter(soloists_L2 == 0)
## # A tibble: 5 × 19
## id progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date Time ID
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 0afec4… 14489 Musici… 2019-… 22157 Virtua… Online… YouT… 2020… 7:40… 6979*
## 2 b64d6e… 14552 Musici… 2019-… 22165 Virtua… Online… YouT… 2020… 1:00… 5276…
## 3 23ee98… 14586 Musici… 2019-… 22171 Virtua… Online… Inst… 2020… 1:00… 1319…
## 4 70f296… 14477 Musici… 2019-… 22178 Virtua… Online… YouT… 2020… 1:20… 1320…
## 5 398d6d… 14470 Musici… 2019-… 22181 Virtua… Online… YouT… 2020… 10:4… 5007…
## # … with 8 more variables: composerName <chr>, workTitle <chr>,
## # conductorName <chr>, soloists <list>, movement <chr>, interval <chr>,
## # soloists_L <int>, soloists_L2 <int>, and abbreviated variable names
## # ¹programID, ²orchestra, ³concertID, ⁴eventType, ⁵Location
Per esempio.
programsT %>%
filter(programID == 14489) %>%
pull(soloists)
## [[1]]
## [[1]][[1]]
## NULL
Faccio quindi unnest_longer, poi unnest_wider, il numero di colonne passa da 17 a 19 (soloistName, soloistInstrument e soloistRoles al posto di soloists).
programsT <- programsT %>%
unnest_longer(soloists, keep_empty = TRUE) %>%
unnest_wider(soloists)
programsT
## # A tibble: 174,443 × 19
## id progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date Time ID
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 5244…
## 2 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 8834…
## 3 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 3642*
## 4 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 3642*
## 5 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 3642*
## 6 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 3642*
## 7 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 3642*
## 8 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 0*
## 9 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 8834…
## 10 00646… 3853 New Yo… 1842-… 1 Subscr… Manhat… Apol… 1842… 8:00… 8835…
## # … with 174,433 more rows, 8 more variables: composerName <chr>,
## # workTitle <chr>, conductorName <chr>, soloistName <chr>,
## # soloistInstrument <chr>, soloistRoles <chr>, movement <chr>,
## # interval <chr>, and abbreviated variable names ¹programID, ²orchestra,
## # ³concertID, ⁴eventType, ⁵Location
id rappresenta il GUID, che fa parte dell’indirizzo a cui è possibile vedere il programma online (archives.nyphil.org/index.php/artifact/GUID/fullview). ci sono tre programID che sono associati a più GUID. Tengo comunque la colonna id.
ids <- programsT %>%
distinct(id, programID) %>%
group_by(programID) %>%
count(sort = TRUE) %>%
filter(n>1) %>%
select(programID) %>%
ungroup()
semi_join(programsT, ids) %>%
distinct(id, programID) %>%
select(id, programID)
## Joining with `by = join_by(programID)`
## # A tibble: 8 × 2
## id programID
## <chr> <chr>
## 1 af869073-9643-4dab-b182-948e3f2e6ab9-0.1 8950
## 2 f4cf3522-7910-4e90-9978-ab70ac615ae4-0.1 8950
## 3 b31c88f2-1774-4f8b-94cf-835952c65175-0.1 8950
## 4 38283dce-9333-46c4-828a-54c0ad957c7b-0.1 10525
## 5 ee1431f7-6f25-4e5f-b815-4be9d743cf03-0.1 10525
## 6 8a73c52f-95a8-411c-b6e8-3d7bc4d7b90a-0.1 10525
## 7 f4fd303d-46c7-4233-b087-0d2f5f91cc7b-0.1 5358
## 8 b533775d-c639-4f79-9838-f88a88f79e95-0.1 5358
Correggo come è scritta la data, aggiungo anche componenti anno, mese, giorno della data.
#ci sono 5591 righe che contengono None invece dell'orario in formato AM o PM
programsT %>%
mutate(trovato = str_detect(programsT$Time, "\\d{1,2}:\\d{1,2}[P,A]M")) %>%
filter(trovato == FALSE)
## # A tibble: 5,591 × 20
## id progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date Time ID
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 09581… 305 New Yo… 1843-… 5 Subscr… Manhat… Apol… 1843… None 5245…
## 2 09581… 305 New Yo… 1843-… 5 Subscr… Manhat… Apol… 1843… None 3677…
## 3 09581… 305 New Yo… 1843-… 5 Subscr… Manhat… Apol… 1843… None 0*
## 4 09581… 305 New Yo… 1843-… 5 Subscr… Manhat… Apol… 1843… None 8955…
## 5 09581… 305 New Yo… 1843-… 5 Subscr… Manhat… Apol… 1843… None 5190…
## 6 09581… 305 New Yo… 1843-… 5 Subscr… Manhat… Apol… 1843… None 5072…
## 7 8025e… 4226 New Yo… 1843-… 7 Subscr… Manhat… Apol… 1844… None 3707*
## 8 8025e… 4226 New Yo… 1843-… 7 Subscr… Manhat… Apol… 1844… None 3712*
## 9 8025e… 4226 New Yo… 1843-… 7 Subscr… Manhat… Apol… 1844… None 3712*
## 10 8025e… 4226 New Yo… 1843-… 7 Subscr… Manhat… Apol… 1844… None 0*
## # … with 5,581 more rows, 9 more variables: composerName <chr>,
## # workTitle <chr>, conductorName <chr>, soloistName <chr>,
## # soloistInstrument <chr>, soloistRoles <chr>, movement <chr>,
## # interval <chr>, trovato <lgl>, and abbreviated variable names ¹programID,
## # ²orchestra, ³concertID, ⁴eventType, ⁵Location
# lascio Time com'è, sistemo la data, tenendo anche anno, mese, giorno
programsT <- programsT %>%
mutate(Date = str_extract(Date, "[[:digit:]-]+")) %>%
mutate(Date = ymd(Date)) %>%
mutate(Year = year(Date)) %>%
mutate(Month = month(Date)) %>%
mutate(Day = day(Date)) %>%
select(id, programID, orchestra, season, concertID, eventType, Location, Venue, Date, Year, Month, Day, Time, ID, composerName, workTitle, conductorName, soloistName, soloistInstrument, soloistRoles, movement, interval)
Divido ID in workID e movementID
programsT <- programsT %>%
separate(ID, into = c("workID", "movID"), sep = "\\*") %>%
mutate(workID = as.integer(workID)) %>%
mutate(movID = as.integer(movID))
Ci sono delle celle con valori multipli in conductorName.
programsT %>%
mutate(co_conductor = str_detect(conductorName, ";")) %>%
filter(co_conductor)
## # A tibble: 502 × 24
## id progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date Year
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <date> <dbl>
## 1 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## 2 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## 3 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## 4 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## 5 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## 6 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## 7 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## 8 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## 9 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## 10 1b7479… 10581 New Yo… 1847-… 24 Special Manhat… Cast… 1848-02-05 1848
## # … with 492 more rows, 14 more variables: Month <dbl>, Day <int>, Time <chr>,
## # workID <int>, movID <int>, composerName <chr>, workTitle <chr>,
## # conductorName <chr>, soloistName <chr>, soloistInstrument <chr>,
## # soloistRoles <chr>, movement <chr>, interval <chr>, co_conductor <lgl>, and
## # abbreviated variable names ¹programID, ²orchestra, ³concertID, ⁴eventType,
## # ⁵Location
Correggo un errore (un conductorName scritto come “de Waart, Edo; ; ; de Waart, Edo”).
programsT <- programsT %>%
mutate(conductorName = if_else(str_detect(conductorName, ";\\s*;\\s*;"), "de Waart, Edo", conductorName))
Correggo errori nei nomi dei compositori.
programsT <- programsT %>%
mutate(composerName = str_replace_all(composerName, "\\s+", " "))
Modifico aggiungendo una colonna che indica la presenza di co-conduzione.
programsT <- programsT %>%
mutate(conductorName = str_remove(conductorName, "^; ")) %>%
mutate(co_conductor = str_detect(conductorName, ";")) %>%
separate_longer_delim(conductorName, "; ")
Costruisco un tibble ridotto per le analisi successive.
performances <- programsT %>%
filter(is.na(interval)) %>%
distinct(concertID, Date, composerName, workTitle, conductorName, co_conductor, orchestra, eventType, Location, season) %>%
mutate(seasonYear = sapply(season,function (x) as.integer(str_extract(x, "[[:digit:]]+")))) %>%
ungroup()
remove(programsT)
Orchestre presenti nel database.
ggplot(performances) +
geom_bar(aes(y = orchestra))
Orchestre nel tempo.
ggplot(performances) +
geom_point(aes(seasonYear, orchestra))
Andamento numero performance nelle stagioni.
performances %>%
group_by(seasonYear) %>%
count() %>%
ggplot(mapping = aes(x = seasonYear, y = n)) +
geom_point()
Che tipi di performance ci sono. Mi limito alle categorie che contengono più di 100 performance.
performances %>%
group_by(eventType) %>%
count() %>%
filter(n > 100) %>%
ggplot(aes(x = n, y = eventType)) +
geom_bar(stat = "identity")
Tipologie eventi New York Philharmonic.
performances %>%
filter(orchestra == "New York Philharmonic") %>%
mutate(group = if_else(eventType == "Subscription Season", "SUB", "REST")) %>%
group_by(seasonYear,group) %>%
count() %>%
ggplot(aes(x = seasonYear, y = n, colour = group)) +
geom_point()
Quali sono gli eventType preponderanti dalla stagione 2000-2001 in poi non di tipo Subscription Season?
performances %>%
filter(orchestra == "New York Philharmonic") %>%
filter(seasonYear >= 2000) %>%
filter(eventType != "Subscription Season") %>%
group_by(seasonYear,eventType) %>%
count() %>%
ggplot(aes(x = seasonYear, y = n, color = eventType)) +
geom_line(alpha = 1/2, show.legend = FALSE)
Fra gli eventi che non fanno parte della stagione regolare, quelli presenti con maggior continuità sono tour e concerti non-subscription.
performances %>%
filter(orchestra == "New York Philharmonic") %>%
filter(seasonYear >= 2000) %>%
filter(eventType != "Subscription Season") %>%
ggplot(aes(y=eventType)) +
geom_bar()
performances %>%
filter(orchestra == "New York Philharmonic") %>%
filter(seasonYear >= 2000) %>%
filter(eventType != "Subscription Season") %>%
group_by(seasonYear,eventType) %>%
count() %>%
group_by(seasonYear) %>%
mutate(seasonEvents = sum(n)) %>%
ungroup() %>%
mutate(perc = n / seasonEvents) %>%
group_by(seasonYear) %>%
filter(perc > 0.2) %>%
ungroup() %>%
ggplot() +
geom_point(aes(seasonYear, perc, color = eventType)) +
geom_line(aes(seasonYear, perc, color = eventType))
performances %>%
filter(orchestra == "New York Philharmonic") %>%
filter(seasonYear >= 2000) %>%
filter(eventType != "Subscription Season") %>%
group_by(seasonYear,eventType) %>%
count() %>%
group_by(seasonYear) %>%
mutate(seasonEvents = sum(n)) %>%
ungroup() %>%
mutate(perc = n / seasonEvents) %>%
group_by(seasonYear) %>%
filter(eventType == "Bandwagon")
## # A tibble: 1 × 5
## # Groups: seasonYear [1]
## seasonYear eventType n seasonEvents perc
## <int> <chr> <int> <int> <dbl>
## 1 2020 Bandwagon 47 131 0.359
Da ora in poi mi limito a considerare New York Philharmonic. 69759 performance.
performances <- performances %>%
filter(orchestra == "New York Philharmonic")
Si possono trovare dei trend che ci permettano di dire qualcosa sulla popolarità di alcuni compositori nel corso del tempo? Qui sotto un grafico con stagioni - numero di performance per compositore.
pcp <- performances %>%
group_by(seasonYear, composerName) %>%
count() %>%
group_by(seasonYear) %>%
mutate(totalSeason = sum(n)) %>%
ungroup() %>%
mutate(perc = n / totalSeason)
saveRDS(pcp, file = "pcp.RDS")
ggplot(pcp,aes(seasonYear, n, color = composerName)) +
geom_line(alpha = 1/4, show.legend = FALSE)
Questo con le percentuali. Per ogni compositore, la percentuale di performance del compositore rispetto al totale della stagione.
ggplot(pcp,aes(seasonYear, perc, color = composerName)) +
geom_line(alpha = 1/4, show.legend = FALSE)
Costruisco un modello quadratico per tutti i compositori (dipendente percentuale, indipendenti stagione e stagione al quadrato).
pcp <- pcp %>%
mutate(seasonYear_2 = seasonYear**2)
pcp_nested <- pcp %>%
group_by(composerName) %>%
nest()
pcp_model <- function(df) {
lm(perc ~ seasonYear + seasonYear_2, data = df)
}
pcp_nested <- pcp_nested %>%
mutate(model = map(data, pcp_model))
pcp_nested <- pcp_nested %>%
mutate(data = map2(data, model, add_residuals))
## Warning: There were 1354 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `data = map2(data, model, add_residuals)`.
## ℹ In group 1: `composerName = "ACT,"`.
## Caused by warning in `predict.lm()`:
## ! prediction from a rank-deficient fit may be misleading
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 1353 remaining warnings.
pcp_nested <- pcp_nested %>%
mutate(glance = map(model, broom::glance))
pcp_models_simp <- pcp_nested %>%
mutate(model = map(model, tidy)) %>%
unnest_longer(model) %>%
unnest_wider(model) %>%
select(composerName,term,estimate) %>%
pivot_wider(names_from = term,values_from = estimate)
saveRDS(pcp_models_simp, file = "pcp_models_simp.RDS")
pcp_glance <- unnest(pcp_nested, glance, names_sep = "_")
saveRDS(pcp_glance, file = "pcp_glance.RDS")
pcp_resids <- unnest(pcp_nested, data)
saveRDS(pcp_resids, file = "pcp_resids.RDS")
Compositori meno di 26 stagioni.
atleast26 <- performances %>%
group_by(composerName) %>%
summarise(seasons = n_distinct(seasonYear)) %>%
filter(seasons <= 25)
Modelli in cui tutte e due le componenti sono positive. Non ci sono.
up_up <- pcp_models_simp %>%
filter(seasonYear > 0, seasonYear_2 > 0) %>%
select(composerName) %>%
head(20)
Modelli in cui tutte e due le componenti sono negative. Non ci sono.
down_down <- pcp_models_simp %>%
filter(seasonYear<0,seasonYear_2<0) %>%
select(composerName) %>%
head(20)
Modelli in cui la componente quadratica è positiva e la componente lineare è negativa. Compositori con almeno 26 stagioni.
down_up <- pcp_models_simp %>%
filter(seasonYear<0,seasonYear_2>0) %>%
select(composerName)
pcp %>%
anti_join(atleast26) %>%
semi_join(down_up) %>%
ggplot(aes(x = seasonYear, y = perc)) +
geom_point(alpha = 1/4) +
facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`
## Joining with `by = join_by(composerName)`
Modelli in cui la componente quadratica è negativa e in cui la componente lineare è positiva. Compositori con almeno 26 stagioni.
up_down <- pcp_models_simp %>%
filter(seasonYear>0,seasonYear_2<0) %>%
select(composerName)
pcp %>%
anti_join(atleast26) %>%
semi_join(up_down) %>%
ggplot(aes(x = seasonYear, y = perc)) +
geom_point(alpha = 1/4) +
facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`
## Joining with `by = join_by(composerName)`
Residui.
pcp_resids %>%
ggplot(aes(resid)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot dei residui nel tempo.
ggplot(pcp_resids) +
geom_point(aes(seasonYear, resid), alpha = 1/4, show.legend = FALSE)
Guardo ai compositori che hanno residui più grandi di 0.1.
pcp_highres <- pcp_resids %>%
filter(resid > 0.1 | resid < -0.1) %>%
distinct(composerName)
pcp %>%
inner_join(pcp_highres) %>%
ggplot() +
geom_point(aes(x = seasonYear, y = perc),alpha = 1/4) +
facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`
Modelli che fittano abbastanza bene. Compositori con almeno 26 stagioni.
pcp %>%
anti_join(atleast26) %>%
semi_join(
pcp_glance %>%
filter(glance_r.squared > 0.6) %>%
select(composerName)
) %>%
ggplot() +
geom_point(aes(x = seasonYear, y = perc),alpha = 1/4) +
facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`
## Joining with `by = join_by(composerName)`
Compositori presenti in almeno 80 stagioni, con densità 70 %. Sono 25 compositori, responsabili per il 57 % delle performance.
over80 <- performances %>%
group_by(composerName) %>%
mutate(
seasons = n_distinct(seasonYear),
first = min(seasonYear),
last = max(seasonYear),
span = last - first + 1,
density = seasons / span) %>%
filter(seasons > 80 & density > 0.70) %>%
distinct(composerName)
performances %>%
mutate (totali = n()) %>%
inner_join(over80) %>%
mutate (over80 = n(), perc = over80 / totali) %>%
distinct(perc)
## Joining with `by = join_by(composerName)`
## # A tibble: 1 × 1
## perc
## <dbl>
## 1 0.574
Solo gli over 80.
ggplot(
pcp %>%
inner_join(over80)
) +
geom_point(aes(x = seasonYear, y = perc),alpha = 1/4) +
facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`
Simile, più di 40 stagioni, con densità del 70%.
over40 <- performances %>%
group_by(composerName) %>%
mutate(
seasons = n_distinct(seasonYear),
first = min(seasonYear),
last = max(seasonYear),
span = last - first + 1,
density = seasons / span) %>%
filter(seasons >= 40 & density > 0.70) %>%
distinct(composerName)
performances %>%
mutate (totali = n()) %>%
inner_join(over40) %>%
mutate (over40 = n(), perc = over40 / totali) %>%
distinct(perc)
## Joining with `by = join_by(composerName)`
## # A tibble: 1 × 1
## perc
## <dbl>
## 1 0.639
Over 40.
ggplot(
pcp %>%
inner_join(over40)
) +
geom_point(aes(x = seasonYear, y = perc),alpha = 1/4) +
facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`
Gini compositori.
library(ineq)
gini_comp <- pcp %>%
select(seasonYear,n) %>%
group_by(seasonYear) %>%
mutate(gini = Gini(n)) %>%
distinct(seasonYear,gini)
ggplot(gini_comp) +
geom_vline(aes(xintercept = 1909, color = "red"), show.legend = FALSE) +
geom_line(aes(seasonYear,gini))
Un altro subset, compositori rappresentati prima della stagione 1909, per almeno 8 stagioni. Rappresentano il 48 % delle performance totali.
before_1909 <- performances %>%
group_by(composerName) %>%
filter(seasonYear < 1909) %>%
mutate(seasons = n_distinct(seasonYear)) %>%
filter(seasons >= 8) %>%
distinct(composerName)
saveRDS(before_1909, file = "before_1909")
performances %>%
mutate (totali = n()) %>%
inner_join(before_1909) %>%
mutate (before_1909 = n(), perc = before_1909 / totali) %>%
distinct(perc)
## Joining with `by = join_by(composerName)`
## # A tibble: 1 × 1
## perc
## <dbl>
## 1 0.478
Compositori rappresentati prima della stagione 1909-1910 per almeno 8 stagioni.
ggplot(
pcp %>%
inner_join(before_1909)
) +
geom_point(aes(x = seasonYear, y = perc),alpha = 1/4) +
geom_vline(aes(xintercept = 1909, color = "red"), show.legend = FALSE) +
facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`
Numero di compositori per stagione.
performances %>%
group_by(seasonYear) %>%
mutate(composers = n_distinct(composerName)) %>%
ungroup() %>%
ggplot() +
geom_point(aes(seasonYear,composers)) +
geom_vline(aes(xintercept = 1909, color = "red"), show.legend = FALSE)
Stagioni che hanno Gini compositori maggiore di 0.5.
pcp %>%
inner_join(
gini_comp %>%
filter(gini > 0.5)
) %>%
ggplot(aes(seasonYear,perc,color=composerName)) +
geom_point(show.legend = FALSE)
## Joining with `by = join_by(seasonYear)`
Stagioni che hanno Gini compositori maggiore di 0.5. Compositori che nella stagione hanno più dell’10% delle performance.
pcp %>%
inner_join(
gini_comp %>%
filter(gini > 0.5)
) %>%
filter(perc > 0.1) %>%
ggplot(aes(seasonYear,perc,color=composerName)) +
geom_point(show.legend = TRUE)
## Joining with `by = join_by(seasonYear)`
Mediana percentuali per stagione.
pcp %>%
group_by(seasonYear) %>%
mutate(median = median(perc)) %>%
ggplot() +
geom_line(aes(seasonYear,median)) +
geom_vline(aes(xintercept = 1909, color = "red"), show.legend = FALSE) +
annotate("text", x = 1917, y = 0.06, label="1909", angle=0)
Si possono individuare nei trend all’interno dell’evoluzione dei repertori dei conduttori? Modello quadratico.
ccn2 <- performances %>%
filter(!is.na(conductorName)) %>%
filter(conductorName != "Not conducted") %>%
group_by(seasonYear,conductorName,composerName) %>%
mutate(comp_cond_season = n()) %>%
group_by(seasonYear,conductorName) %>%
mutate(cond_season = n()) %>%
mutate(perc = comp_cond_season / cond_season) %>%
group_by(conductorName,composerName) %>%
mutate(seasonYear_2 = seasonYear**2) %>%
ungroup() %>%
distinct(seasonYear,seasonYear_2,conductorName,composerName,perc)
saveRDS(ccn2, file = "ccn2.RDS")
ccn2_nested <- ccn2 %>%
group_by(conductorName,composerName) %>%
nest()
ccn2_func <- function(df) {
lm(perc ~ seasonYear + seasonYear_2, data = df)
}
regs <- ccn2_nested %>%
mutate(
model = map(data, ccn2_func),
tidied = map(model, tidy),
glanced = map(model, glance),
augmented = map(model, augment)
)
## Warning: There were 3 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `tidied = map(model, tidy)`.
## ℹ In group 7377: `conductorName = "Stahlberg, Fritz"`, `composerName =
## "Stahlberg, Fritz"`.
## Caused by warning in `summary.lm()`:
## ! essentially perfect fit: summary may be unreliable
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 2 remaining warnings.
saveRDS(regs, file = "regs.RDS")
Modelli “molto buoni” e “buoni”.
very_good <- regs %>%
unnest(glanced) %>%
filter(r.squared >= 0.9) %>%
select(conductorName, composerName)
good <- regs %>%
unnest(glanced) %>%
filter(r.squared >= 0.8) %>%
select(conductorName, composerName)
Come sono distribuiti i residui.
regs %>%
unnest(augmented) %>%
select(.resid) %>%
ggplot() +
geom_histogram(aes(.resid))
## Adding missing grouping variables: `conductorName`, `composerName`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Seleziono quelli più alti di 0.2
highres <- regs %>%
unnest(augmented) %>%
filter(.resid > 0.2 | .resid < -0.2) %>%
distinct(conductorName,composerName,seasonYear,.resid)
Direttori-compositori con fit buono per coppie che compaiono almeno per 5 stagioni.
ccn2 %>%
group_by(conductorName,composerName) %>%
mutate(seasons = n_distinct(seasonYear)) %>%
filter(seasons >= 5) %>%
inner_join(good) %>%
ggplot() +
geom_line(aes(seasonYear,perc,color=conductorName), show.legend = FALSE) +
geom_point(aes(seasonYear,perc,color=conductorName), show.legend = FALSE) +
facet_wrap(~conductorName)
## Joining with `by = join_by(conductorName, composerName)`
Bernstein, fit buono. Almeno tre stagioni.
ccn2 %>%
group_by(conductorName,composerName) %>%
mutate(seasons = n_distinct(seasonYear)) %>%
filter(seasons > 2) %>%
filter(conductorName == "Bernstein, Leonard") %>%
inner_join(good) %>%
ggplot() +
geom_line(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
geom_point(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
facet_wrap(~composerName)
## Joining with `by = join_by(conductorName, composerName)`
Bernstein non nel fit buono. Almeno tre stagioni
ccn2 %>%
group_by(conductorName,composerName) %>%
mutate(seasons = n_distinct(seasonYear)) %>%
filter(seasons > 2) %>%
filter(conductorName == "Bernstein, Leonard") %>%
anti_join(good) %>%
ggplot() +
geom_line(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
geom_point(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
facet_wrap(~composerName)
## Joining with `by = join_by(conductorName, composerName)`
Masur, fit buono. Almeno tre stagioni.
ccn2 %>%
group_by(conductorName,composerName) %>%
mutate(seasons = n_distinct(seasonYear)) %>%
filter(seasons > 2) %>%
filter(conductorName == "Masur, Kurt") %>%
inner_join(good) %>%
ggplot() +
geom_line(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
geom_point(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
facet_wrap(~composerName)
## Joining with `by = join_by(conductorName, composerName)`
Masur, non nel fit buono. Almeno tre stagioni.
ccn2 %>%
group_by(conductorName,composerName) %>%
mutate(seasons = n_distinct(seasonYear)) %>%
filter(seasons > 2) %>%
filter(conductorName == "Masur, Kurt") %>%
anti_join(good) %>%
ggplot() +
geom_line(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
geom_point(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
facet_wrap(~composerName)
## Joining with `by = join_by(conductorName, composerName)`
Boulez, fit buono. Almeno tre stagioni.
ccn2 %>%
group_by(conductorName,composerName) %>%
mutate(seasons = n_distinct(seasonYear)) %>%
filter(seasons > 2) %>%
filter(conductorName == "Boulez, Pierre") %>%
inner_join(good) %>%
ggplot() +
geom_line(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
geom_point(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
facet_wrap(~composerName)
## Joining with `by = join_by(conductorName, composerName)`
Boulez, non nel fit buono. Almeno tre stagioni.
ccn2 %>%
group_by(conductorName,composerName) %>%
mutate(seasons = n_distinct(seasonYear)) %>%
filter(seasons > 2) %>%
filter(conductorName == "Boulez, Pierre") %>%
anti_join(good) %>%
ggplot() +
geom_line(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
geom_point(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
facet_wrap(~composerName)
## Joining with `by = join_by(conductorName, composerName)`
remove(ccn2)
remove(regs)
Modello lineare. Per ogni direttore-compositore modello, in base all’anno, la variazione della percentuale delle performance di quel compositore nelle performance totali del direttore nell’anno.
ccn <- performances %>%
filter(!is.na(conductorName)) %>%
filter(conductorName != "Not conducted") %>%
group_by(seasonYear,conductorName,composerName) %>%
mutate(comp_cond_season = n()) %>%
group_by(seasonYear,conductorName) %>%
mutate(cond_season = n()) %>%
mutate(perc = comp_cond_season / cond_season) %>%
group_by(conductorName,composerName) %>%
ungroup() %>%
distinct(seasonYear,conductorName,composerName,perc)
saveRDS(ccn, file = "ccn.RDS")
ccn_nested <- ccn %>%
group_by(conductorName,composerName) %>%
nest()
ccn_func <- function(df) {
lm(perc ~ seasonYear, data = df)
}
lin_regs <- ccn_nested %>%
mutate(
model = map(data, ccn_func),
tidied = map(model, tidy),
glanced = map(model, glance),
augmented = map(model, augment)
)
## Warning: There were 15 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `tidied = map(model, tidy)`.
## ℹ In group 3417: `conductorName = "Jansons, Mariss"`, `composerName = "Rossini,
## Gioachino"`.
## Caused by warning in `summary.lm()`:
## ! essentially perfect fit: summary may be unreliable
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 14 remaining warnings.
saveRDS(lin_regs, file = "lin_regs.RDS")
Modelli con R^2 sopra 0.9 e sopra 0.8.
very_good_lin <- lin_regs %>%
unnest(glanced) %>%
filter(r.squared >= 0.9) %>%
select(conductorName, composerName)
good_lin <- lin_regs %>%
unnest(glanced) %>%
filter(r.squared >= 0.8) %>%
select(conductorName, composerName)
Modelli con R^2 sopra 0.8 (coppie direttori-compositori con almeno cinque stagioni).
ccn %>%
group_by(conductorName,composerName) %>%
mutate(seasons = n_distinct(seasonYear)) %>%
filter(seasons >= 5) %>%
inner_join(good_lin) %>%
ggplot() +
geom_line(aes(seasonYear,perc,color=conductorName), show.legend = FALSE) +
geom_point(aes(seasonYear,perc,color=conductorName), show.legend = FALSE) +
facet_wrap(~conductorName)
## Joining with `by = join_by(conductorName, composerName)`
Bernstein modelli lineari positivi e negativi.
BernsteinHeadNeg <- lin_regs %>%
unnest(tidied) %>%
select(conductorName,composerName,term,estimate) %>%
pivot_wider(names_from = term,values_from = estimate) %>%
select(conductorName,composerName,seasonYear) %>%
filter(!is.na(seasonYear)) %>%
filter(conductorName == "Bernstein, Leonard") %>%
filter(seasonYear < 0) %>%
arrange(seasonYear)
saveRDS(BernsteinHeadNeg, file = "BernsteinHeadNeg.RDS")
BernsteinHeadPos <- lin_regs %>%
unnest(tidied) %>%
select(conductorName,composerName,term,estimate) %>%
pivot_wider(names_from = term,values_from = estimate) %>%
select(conductorName,composerName,seasonYear) %>%
filter(!is.na(seasonYear)) %>%
filter(conductorName == "Bernstein, Leonard") %>%
filter(seasonYear > 0) %>%
arrange(-seasonYear)
saveRDS(BernsteinHeadPos, file = "BernsteinHeadPos.RDS")
Bernstein lineari positivi (almeno dieci stagioni).
ccn %>%
group_by(conductorName,composerName) %>%
mutate(seasons = n_distinct(seasonYear)) %>%
filter(seasons >= 10) %>%
inner_join(BernsteinHeadPos, by = c("conductorName","composerName")) %>%
ggplot() +
geom_line(aes(seasonYear.x,perc,color=composerName), show.legend = FALSE) +
geom_point(aes(seasonYear.x,perc,color=composerName), show.legend = FALSE) +
facet_wrap(~composerName)
Bernstein lineari negativi. Almeno cinque stagioni.
ccn %>%
group_by(conductorName,composerName) %>%
mutate(seasons = n_distinct(seasonYear)) %>%
filter(seasons >= 5) %>%
inner_join(BernsteinHeadNeg, by = c("conductorName","composerName")) %>%
ggplot() +
geom_line(aes(seasonYear.x,perc,color=composerName), show.legend = FALSE) +
geom_point(aes(seasonYear.x,perc,color=composerName), show.legend = FALSE) +
facet_wrap(~composerName)
Anthem, legato ai tour.
performances %>%
filter(composerName == "Anthem,") %>%
ggplot() +
geom_point(aes(Date,workTitle,color=eventType), show.legend = FALSE)
Anthem, cresce dal 1957 per i molti tour all’estero e nel 1918 per i tour nazionali.
performances %>%
filter(composerName == "Anthem,") %>%
filter(eventType == "Tour") %>%
ggplot() +
geom_histogram(aes(seasonYear), binwidth = 1) +
geom_vline(aes(xintercept = 1957, color = "red"), show.legend = FALSE) +
geom_vline(aes(xintercept = 1918, color = "red"), show.legend = FALSE)
Tour del 1918 in america, inno americano.
performances %>%
filter(composerName == "Anthem,") %>%
filter(eventType == "Tour") %>%
filter(seasonYear == 1918) %>%
ggplot() +
geom_bar(aes(y = Location, fill = workTitle))
Tour del 1957 nel mondo.
performances %>%
filter(composerName == "Anthem,") %>%
filter(eventType == "Tour") %>%
filter(seasonYear == 1957) %>%
ggplot() +
geom_bar(aes(y = Location, fill = workTitle))
Richard Wagner fino al 1950. Performance e direttori.
performances %>%
filter(composerName == "Wagner, Richard") %>%
filter(seasonYear <= 1950) %>%
ggplot() +
geom_jitter(aes(x = seasonYear, y = conductorName),alpha = 1/4)
Richard Wagner fino al 1950, otto maggiori direttori. Numero di performance e periodo di attività.
top_Wagner <- performances %>%
filter(composerName == "Wagner, Richard") %>%
filter(seasonYear <= 1950) %>%
group_by(seasonYear,conductorName) %>%
count() %>%
group_by(conductorName) %>%
mutate(total = sum(n)) %>%
arrange(-total) %>%
distinct(conductorName, total) %>%
head(8) %>%
mutate(group = conductorName)
colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7")
performances %>%
filter(composerName == "Wagner, Richard") %>%
filter(seasonYear <= 1950) %>%
inner_join(top_Wagner) %>%
ggplot() +
geom_area(aes(seasonYear,fill=conductorName), stat = "bin", binwidth = 1, show.legend = TRUE) +
scale_fill_manual(values=colorBlindBlack8) +
facet_wrap(~conductorName)
## Joining with `by = join_by(conductorName)`
Richard Wagner. Direttore Josef Stransky.
performances %>%
filter(composerName == "Wagner, Richard") %>%
filter(!is.na(conductorName)) %>%
filter(seasonYear <= 1925) %>%
mutate(
group = ifelse(conductorName == "Stransky, Josef", "Stransky, Josef", "Altri conduttori")
) %>%
ggplot() +
geom_bar(aes(seasonYear,fill=group))
Aaron Copland dal 1950 al 1995. Performance e direttori.
performances %>%
filter(composerName == "Copland, Aaron") %>%
filter(seasonYear >= 1950 & seasonYear <= 1995) %>%
ggplot() +
geom_jitter(aes(x = seasonYear, y = conductorName),alpha = 1/4)
Aaron Copland dal 1950 al 1995, otto maggiori direttori. Numero di performance e periodo di attività.
top_Copland <- performances %>%
filter(composerName == "Copland, Aaron") %>%
filter(seasonYear >= 1950 & seasonYear <= 1995) %>%
group_by(seasonYear,conductorName) %>%
count() %>%
group_by(conductorName) %>%
mutate(total = sum(n)) %>%
arrange(-total) %>%
distinct(conductorName, total) %>%
head(8) %>%
mutate(group = conductorName)
performances %>%
filter(composerName == "Copland, Aaron") %>%
filter(seasonYear >= 1950 & seasonYear <= 1995) %>%
inner_join(top_Copland) %>%
ggplot() +
geom_area(aes(seasonYear,fill=conductorName), stat = "bin", binwidth = 1, show.legend = TRUE) +
scale_fill_manual(values=colorBlindBlack8) +
facet_wrap(~conductorName)
## Joining with `by = join_by(conductorName)`